Some Helper Functions

initialize_values <- function(data, n, d, K, seed = 123) {
  # Build starting parameters for an EM run on a Gaussian mixture.
  #
  # Args:
  #   data: numeric matrix (n x d) of observations.
  #   n:    number of rows in `data`.
  #   d:    number of columns in `data` (unused here; kept so all demo
  #         call sites share a uniform signature).
  #   K:    number of mixture components.
  #   seed: RNG seed; the default 123 preserves the original behavior.
  #
  # Returns: list(Pis, Mus, Sigmas) — equal mixing weights, K means
  # sampled from the data rows, and the pooled data covariance copied
  # for every component.
  set.seed(seed)  # NOTE(review): mutates the global RNG state as a side effect

  # Equal mixing proportions for all components
  Pis <- rep(1 / K, K)

  # Means: K rows drawn at random from the dataset
  Mus <- data[sample(seq_len(n), K), , drop = FALSE]

  # Covariances: every component starts from the overall data covariance.
  # cov(data) is loop-invariant, so compute it once.
  data_cov <- cov(data)
  Sigmas <- lapply(seq_len(K), function(i) data_cov)

  list(Pis = Pis, Mus = Mus, Sigmas = Sigmas)
}

convert_to_rgb <- function(k, segmented_img){
  # Map a matrix of cluster labels to an RGB image array.
  #
  # Args:
  #   k:             number of clusters; labels in `segmented_img` are
  #                  assumed to lie in 1..k.
  #   segmented_img: matrix of cluster labels (height x width).
  #
  # Returns: numeric array (height x width x 3) with channel values in [0, 1].
  #
  # BUG FIX: the original read the global `K` (and the loop re-used the
  # name `k`, shadowing the argument), so the `k` parameter was ignored.
  # The parameter is now used as the number of clusters.
  my_colors <- rainbow(k)

  height <- nrow(segmented_img)
  width  <- ncol(segmented_img)

  # Empty 3D array for the colored image
  segmented_rgb <- array(0, dim = c(height, width, 3))

  # Paint each cluster's pixels with its color
  for (cluster in seq_len(k)) {
    # Convert the cluster's color to an (R,G,B) vector in [0,1]
    rgb_vals <- col2rgb(my_colors[cluster]) / 255

    # Where the segmentation == cluster, assign the color
    mask <- (segmented_img == cluster)
    segmented_rgb[,,1][mask] <- rgb_vals[1]  # R channel
    segmented_rgb[,,2][mask] <- rgb_vals[2]  # G channel
    segmented_rgb[,,3][mask] <- rgb_vals[3]  # B channel
  }
  segmented_rgb
}
library(ggplot2)
library(mvtnorm)

generate_gaussian_mixture_data <- function(n, d, K, seed=42) {
  # Sample n points in d dimensions from a K-component Gaussian mixture
  # with equal weights, random means in [0, 10]^d and random PSD
  # covariances. Requires mvtnorm::rmvnorm.
  #
  # Args:
  #   n:    number of observations to draw.
  #   d:    dimensionality of each observation.
  #   K:    number of mixture components.
  #   seed: RNG seed (default 42).
  #
  # Returns: list(data (n x d), z (true labels), Mu, Sigma, Pis).
  set.seed(seed)

  # Equal mixing proportions
  Pis <- rep(1 / K, K)

  # Random component means in [0, 10]^d
  Mu <- lapply(seq_len(K), function(i) runif(d, min = 0, max = 10))
  # Random covariances: crossprod(A) = t(A) %*% A is PSD by construction
  Sigma <- lapply(seq_len(K), function(i) {
    A <- matrix(runif(d * d, min = 0.5, max = 3), d, d)
    crossprod(A)
  })

  # Draw each point from its assigned component
  data <- matrix(0, n, d)
  z <- sample(seq_len(K), n, replace = TRUE, prob = Pis)

  # One rmvnorm draw per row, in row order, so results for a given seed
  # match the original implementation exactly.
  for (i in seq_len(n)) {
    data[i, ] <- rmvnorm(1, Mu[[z[i]]], Sigma[[z[i]]])
  }

  list(data = data, z = z, Mu = Mu, Sigma = Sigma, Pis = Pis)
}
# Example usage ----
d <- 2   # dimensionality of each observation
K <- 2   # number of Gaussian components
n <- 300 # number of data points

data_info <- generate_gaussian_mixture_data(n, d, K, seed = 2)
data   <- data_info$data
Mus    <- data_info$Mu
Sigmas <- data_info$Sigma
Pis    <- data_info$Pis
z      <- data_info$z

# Visualize the generated mixture with a plot suited to the dimensionality
if (d == 1) {
  to.plot <- data.frame(x = data[, 1], class = as.factor(z))
  ggplot(to.plot) + aes(x, fill = class) + geom_density(alpha = 0.5) +
    labs(title = "1D Gaussian Mixture Density", x = "X", y = "Density") +
    theme_minimal()
}

if (d == 2) {
  to.plot <- data.frame(x = data[, 1], y = data[, 2], class = as.factor(z))
  ggplot(to.plot) + aes(x, y, color = class) + geom_point() + geom_density_2d()
}

if (d == 3) {
  library(scatterplot3d)
  scatterplot3d(data, color = as.numeric(z), pch = 19,
                main = "3D Gaussian Mixture",
                xlab = "X", ylab = "Y", zlab = "Z")
}

EM Algorithm Generalization

compute_gamma <- function(pi, phi) {
  # E-step responsibilities: gamma[i, k] = pi[k] * phi[i, k] / sum_j pi[j] * phi[i, j].
  #
  # Args:
  #   pi:  numeric vector of K mixing proportions.
  #   phi: n x K matrix of component densities for each observation.
  #
  # Returns: n x K matrix of responsibilities; each row sums to 1.
  #
  # BUG FIX: the original `phi * pi` recycled the length-K `pi` DOWN each
  # column of the n x K matrix (column-major order), pairing the wrong
  # mixing weight with each density whenever K > 1. Scaling each COLUMN k
  # by pi[k] via sweep() is the intended operation.
  gamma <- sweep(phi, 2, pi, "*")
  gamma / rowSums(gamma)
}

compute_l <- function(pi, gamma, phi) {
  # Observed-data log-likelihood of a K-component Gaussian mixture:
  # sum_i log( sum_k pi[k] * phi[i, k] ).
  #
  # Args:
  #   pi:    numeric vector of K mixing proportions.
  #   gamma: responsibilities (unused; kept for call-site compatibility).
  #   phi:   n x K matrix of per-component densities.
  #
  # BUG FIX: the original `rowSums(phi * pi)` recycled `pi` down each
  # column of `phi` (column-major), mixing up the weights for K > 1.
  # The matrix product phi %*% pi forms the correct weighted row sums.
  sum(log(phi %*% pi))
}

EM_algorithm <- function(data, pi, mu, sigma, tol = 10^-2, verbose=FALSE) {
  # Fit a Gaussian mixture model to `data` by Expectation-Maximization.
  #
  # Args:
  #   data:    numeric matrix (n x d) of observations.
  #   pi:      length-K vector of initial mixing proportions.
  #   mu:      K x d matrix of initial means (row k = mean of component k).
  #   sigma:   list of K initial (d x d) covariance matrices.
  #   tol:     stop when the absolute change in log-likelihood falls below this.
  #   verbose: if TRUE, print the log-likelihood at every iteration.
  #
  # Returns: list(n_iterations, pi, mu, sigma, gamma) — fitted parameters
  # plus the final responsibility matrix.
  #
  # Depends on compute_gamma()/compute_l() defined above and on
  # mvtnorm::dmvnorm for the component densities.
  
  K <- length(pi)  # Number of Gaussian components
  n <- nrow(data)  # Number of data points
  d <- ncol(data)  # Number of dimensions

  # Compute initial phi values for all Gaussians:
  # phi[i, k] = density of observation i under component k
  phi <- sapply(1:K, function(k) dmvnorm(data, mu[k,], sigma[[k]]))
  
  gamma <- compute_gamma(pi, phi)  # Compute probs of belonging

  L_temp <- compute_l(pi, gamma, phi)  # Compute initial log-likelihood
  if(verbose){
    cat("Initial log-likelihood: ", L_temp, "\n")
  }
  
  # L holds the previous iteration's log-likelihood. Starting it at 0
  # makes |L - L_temp| large on entry, so the loop body runs at least
  # once (assumes the initial log-likelihood is not itself within tol of 0).
  L <- 0
  iteration_counter <- 0
  
  while (abs(L - L_temp) >= tol) {
    L <- L_temp

    ## Expectation step ##
    # Responsibilities under the current parameters (phi was refreshed at
    # the end of the previous pass, or just above on the first pass).
    gamma <- compute_gamma(pi, phi)

    ## Maximization step ##
    Nk <- colSums(gamma)  # Number of points assigned to each Gaussian. Sum the columns (prob of belonging of each data point)
    pi <- Nk / n  # Knowing how many are in each Gaussian, we can update the proportion
    
    # Update means: responsibility-weighted average of the data.
    # t(gamma) %*% data is (K x d); dividing by Nk scales row k by 1/Nk[k]
    # (the recycling is safe here because nrow == length(Nk) == K).
    mu <- t(gamma) %*% data / Nk

    # Update covariance matrices: responsibility-weighted scatter of the
    # centered data for each component.
    sigma <- lapply(1:K, function(k) {
      X_centered <- sweep(data, 2, mu[k,])  # Center data
      sigma_k <- (t(X_centered) %*% (X_centered * gamma[, k])) / Nk[k]
      
      # --- REGULARIZE the covariance to prevent singularities ---
      # A small ridge keeps sigma_k invertible even if a cluster collapses
      epsilon <- 1e-6
      sigma_k <- sigma_k + diag(epsilon, ncol(data))
      
      return(sigma_k)
    })
    
    # Refresh the component densities under the updated parameters
    phi <- sapply(1:K, function(k) dmvnorm(data, mu[k,], sigma[[k]]))
    
    # New log-likelihood. NOTE(review): gamma passed here predates the
    # M-step, but compute_l only reads pi and phi, so this is harmless.
    L_temp <- compute_l(pi, gamma, phi)
    iteration_counter <- iteration_counter + 1
    if(verbose){
      cat("Iteration: ", iteration_counter, " | Current log-likelihood: ", L_temp, "\n")
    }
  }
  
  return(list(n_iterations = iteration_counter, pi = pi, mu = mu, sigma = sigma, gamma = gamma))
}

# Run the EM algorithm on the example data generated above
init_values <- initialize_values(data, n, d, K)
# tol = 10^-3 tightens convergence relative to EM_algorithm's default of 10^-2
results <- EM_algorithm(data, init_values$Pis, init_values$Mus, init_values$Sigmas, tol = 10^-3)

cat("EM converged in", results$n_iterations, "iterations.\n")
## EM converged in 12 iterations.

Modified EM Algorithm (with Iteration History)

library(mvtnorm)
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
## 
## Adjuntando el paquete: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ellipse)  
## Warning: package 'ellipse' was built under R version 4.4.3
## 
## Adjuntando el paquete: 'ellipse'
## The following object is masked from 'package:graphics':
## 
##     pairs
# --- Modified EM_algorithm (Storing History for the First 8 Iterations) ---
compute_gamma <- function(pi, phi) {
  # E-step responsibilities: gamma[i, k] proportional to pi[k] * phi[i, k],
  # normalized so each row sums to 1.
  #
  # BUG FIX: `phi * pi` recycled the length-K `pi` DOWN each column of the
  # n x K matrix (column-major), pairing wrong weights for K > 1. sweep()
  # scales each COLUMN k by pi[k], which is the intended operation.
  gamma <- sweep(phi, 2, pi, "*")
  gamma / rowSums(gamma)
}

compute_l <- function(pi, gamma, phi) {
  # Observed-data log-likelihood: sum_i log( sum_k pi[k] * phi[i, k] ).
  # `gamma` is unused; kept so existing call sites keep working.
  #
  # BUG FIX: `rowSums(phi * pi)` recycled `pi` down the columns of `phi`
  # (column-major), pairing wrong weights for K > 1; the matrix product
  # phi %*% pi forms the correct weighted row sums.
  sum(log(phi %*% pi))
}

EM_algorithm <- function(data, pi, mu, sigma, tol = 1e-2, verbose = FALSE) {
  # Same EM fit as the earlier EM_algorithm, but additionally records
  # gamma, mu and sigma for (up to) the first 8 iterations so their
  # evolution can be plotted afterwards.
  #
  # Args: as in the earlier EM_algorithm definition.
  # Returns: list(n_iterations, pi, mu, sigma, gamma, gamma_history,
  #   mu_history, sigma_history). Each *_history[[i]] holds the values at
  #   the START of iteration i (i.e. before that iteration's M-step).
  # Depends on compute_gamma()/compute_l() and mvtnorm::dmvnorm.
  K <- length(pi)
  n <- nrow(data)
  d <- ncol(data)
  
  # phi[i, k] = density of observation i under component k
  phi <- sapply(1:K, function(k) dmvnorm(data, mu[k,], sigma[[k]]))
  gamma <- compute_gamma(pi, phi)
  L_temp <- compute_l(pi, gamma, phi)
  if(verbose){ cat("Initial log-likelihood: ", L_temp, "\n") }
  
  # Previous log-likelihood; starting at 0 forces at least one loop pass
  L <- 0
  iteration_counter <- 0
  
  # Store histories for the first 8 iterations
  gamma_history <- list()
  mu_history <- list()
  sigma_history <- list()
  
  while (abs(L - L_temp) >= tol) {
    L <- L_temp
    # E-step: responsibilities under the current parameters
    gamma <- compute_gamma(pi, phi)
    # Snapshot the state entering this iteration (first 8 iterations only)
    if(iteration_counter < 8) {
      gamma_history[[iteration_counter + 1]] <- gamma
      mu_history[[iteration_counter + 1]] <- mu
      sigma_history[[iteration_counter + 1]] <- sigma
    }
    
    # M-step: update proportions, means and covariances
    Nk <- colSums(gamma)  # effective number of points per component
    pi <- Nk / n
    mu <- t(gamma) %*% data / Nk  # responsibility-weighted means (K x d)
    sigma <- lapply(1:K, function(k) {
      X_centered <- sweep(data, 2, mu[k,])
      sigma_k <- (t(X_centered) %*% (X_centered * gamma[, k])) / Nk[k]
      # Small ridge keeps the covariance invertible (prevents singularities)
      epsilon <- 1e-6
      sigma_k <- sigma_k + diag(epsilon, ncol(data))
      return(sigma_k)
    })
    
    # Refresh densities and log-likelihood under the updated parameters
    phi <- sapply(1:K, function(k) dmvnorm(data, mu[k,], sigma[[k]]))
    L_temp <- compute_l(pi, gamma, phi)
    iteration_counter <- iteration_counter + 1
    if(verbose){
      cat("Iteration: ", iteration_counter, " | Current log-likelihood: ", L_temp, "\n")
    }
  }
  
  return(list(n_iterations = iteration_counter, pi = pi, mu = mu, sigma = sigma,
              gamma = gamma, gamma_history = gamma_history, mu_history = mu_history,
              sigma_history = sigma_history))
}

Visualizations

1D

# --- 1D demo: generate data, run EM, compare true vs. assigned clusters ---

# 1) Generate 1D data (K=2, n=300, etc.)
d <- 1
K <- 2
n <- 300
set.seed(563)
data_info_1d <- generate_gaussian_mixture_data(n, d, K)
data_1d <- data_info_1d$data  # shape: (n x 1)
true_labels_1d <- data_info_1d$z

# Quick look at the generated data
df_1d <- data.frame(
  x       = data_1d[,1],
  cluster = factor(true_labels_1d)
)

# Plot the histogram of the generated data (colored by the true cluster)
ggplot(df_1d, aes(x, fill=cluster)) +
  geom_histogram(alpha=0.5, position="identity", bins=30) +
  theme_minimal() +
  ggtitle("Generated 1D Gaussian Mixture")

# 2) Initialize parameters for EM
init_1d <- initialize_values(data_1d, nrow(data_1d), d, K)

# 3) Run EM algorithm with logging
res_1d <- EM_algorithm(
  data   = data_1d,
  pi     = init_1d$Pis,
  mu     = init_1d$Mus,
  sigma  = init_1d$Sigmas,
  tol    = 1e-4
)

cat("1D EM converged in", res_1d$n_iterations, "iterations.\n")
## 1D EM converged in 55 iterations.
# 5) Assign each data point to the most probable cluster
gamma_1d <- res_1d$gamma
assigned_1d <- apply(gamma_1d, 1, which.max)

# EM's cluster indices are arbitrary: compare against the true labels via
# a confusion matrix and flip the labels if the swapped assignment agrees better.
# NOTE(review): this indexing assumes both clusters received at least one
# point (cm is 2 x 2); it errors if EM collapses everything into one cluster.
cm <- table(true_labels_1d, assigned_1d)
if (cm[1,1] + cm[2,2] < cm[1,2] + cm[2,1]) {
  assigned_1d <- ifelse(assigned_1d == 1, 2, 1)
}
df_1d$assigned <- factor(assigned_1d)

# 6) Visualize final clusters
ggplot(df_1d, aes(x, fill=assigned)) +
  geom_histogram(alpha=0.5, position="identity", bins=30) +
  theme_minimal() +
  ggtitle("1D Gaussian Mixture - EM assigned clusters")

2D

K = 2

# --- 2D Data Generation and Final Clustering Visualization --- #

# 1) Generate 2D data
d <- 2
K <- 2
n <- 300
set.seed(563)
data_info_2d <- generate_gaussian_mixture_data(n, d, K)
data_2d <- data_info_2d$data    # shape: (n x 2)
true_labels_2d <- data_info_2d$z

df_2d <- data.frame(
  x = data_2d[,1],
  y = data_2d[,2],
  cluster = factor(true_labels_2d)
)

# Plot the generated data (before EM)
ggplot(df_2d, aes(x, y, color = cluster)) +
  geom_point() +
  geom_density_2d() +
  theme_minimal() +
  ggtitle("Generated 2D Gaussian Mixture (True Labels)")

# 2) Initialize parameters
init_2d <- initialize_values(data_2d, nrow(data_2d), d, K)

# 3) Run EM Algorithm (storing history for first 8 iterations)
res_2d <- EM_algorithm(data_2d, init_2d$Pis, init_2d$Mus, init_2d$Sigmas, tol = 1e-4)
cat("2D EM converged in", res_2d$n_iterations, "iterations.\n")
## 2D EM converged in 8 iterations.
# 4) Final cluster assignment: each point goes to its most probable cluster
gamma_2d <- res_2d$gamma
assigned_2d <- apply(gamma_2d, 1, which.max)

# EM's cluster indices are arbitrary: flip the labels if the swapped
# assignment agrees better with the true labels.
# NOTE(review): assumes cm is 2 x 2 (both clusters non-empty); errors otherwise.
cm <- table(true_labels_2d, assigned_2d)
if (cm[1,1] + cm[2,2] < cm[1,2] + cm[2,1]) {
  assigned_2d <- ifelse(assigned_2d == 1, 2, 1)
}
df_2d$assigned <- factor(assigned_2d)

# Plot the final clustering (after EM)
ggplot(df_2d, aes(x, y, color = assigned)) +
  geom_point() +
  theme_minimal() +
  ggtitle("2D Gaussian Mixture - EM Assigned Clusters")

# --- 5a) Visualization: Gamma Evolution (During EM) --- #

# One scatter plot per stored iteration, colored by the responsibility
# (gamma) each point has for cluster 1.
plots_gamma <- list()
for (i in seq_along(res_2d$gamma_history)) {
  gamma_iter <- res_2d$gamma_history[[i]]
  df_gamma <- data.frame(x = data_2d[,1], y = data_2d[,2], prob_cluster1 = gamma_iter[,1])
  
  p_gamma <- ggplot(df_gamma, aes(x = x, y = y, color = prob_cluster1)) +
    geom_point() +
    scale_color_gradient(low = "blue", high = "red") +
    ggtitle(paste("Gamma Evolution - Iteration", i)) +
    theme_minimal()
  
  plots_gamma[[i]] <- p_gamma
}

# Arrange the gamma-evolution plots in a grid.
# BUG FIX: grid.arrange() lives in the gridExtra package, which was never
# attached with library(); the namespace-qualified call makes the
# dependency explicit and works without attaching it.
gridExtra::grid.arrange(grobs = plots_gamma, ncol = 4, top = "Gamma Evolution (Probability for Cluster 1)")

# --- 5b) Visualization: Parameter Evolution (During EM) --- #

# One plot per stored iteration: data in gray, plus each cluster's mean
# and 95% covariance ellipse (via ellipse::ellipse).
plots_params <- list()
for (i in seq_along(res_2d$mu_history)) {
  current_mu <- res_2d$mu_history[[i]]
  current_sigma <- res_2d$sigma_history[[i]]
  
  p_param <- ggplot(df_2d, aes(x = x, y = y)) +
    geom_point(alpha = 0.5, color = "gray") +
    ggtitle(paste("Parameter Evolution - Iteration", i)) +
    theme_minimal()
  
  for (k in 1:K) {
    cluster_color <- ifelse(k == 1, "red", "blue")
    ellipse_points <- as.data.frame(ellipse(current_sigma[[k]], centre = current_mu[k,], level = 0.95))
    # BUG FIX: the original mapped the mean through aes(), whose
    # expressions are evaluated lazily when the plot is PRINTED — by then
    # `k` and `current_mu` hold their final loop values, so every stored
    # plot showed the same (last) mean. annotate() captures the values
    # eagerly, so each plot keeps its own iteration's parameters.
    p_param <- p_param +
      geom_path(data = ellipse_points, aes(x = x, y = y),
                color = cluster_color, size = 1) +
      annotate("point", x = current_mu[k, 1], y = current_mu[k, 2],
               color = cluster_color, size = 3)
  }
  plots_params[[i]] <- p_param
}

# BUG FIX: grid.arrange() is from gridExtra, which was never attached;
# the namespace-qualified call works without library(gridExtra).
gridExtra::grid.arrange(grobs = plots_params, ncol = 4, top = "Parameter Evolution (Means & Covariance Ellipses)")

# --- 3D Data Generation ---
set.seed(563)
d <- 3
K <- 2
n <- 300
data_info_3d <- generate_gaussian_mixture_data(n, d, K)
data_3d <- data_info_3d$data    # (n x 3)
true_labels_3d <- data_info_3d$z
df_3d <- data.frame(x = data_3d[,1], y = data_3d[,2], z = data_3d[,3],
                    cluster = factor(true_labels_3d))

# --- Visualization: Before EM (True Labels) ---
# Interactive 3D scatter via plotly, colored by the true component labels
p_before <- plot_ly(df_3d, x = ~x, y = ~y, z = ~z, color = ~cluster,
                    colors = c("red", "blue"), type = "scatter3d", mode = "markers") %>% 
  layout(title = "Generated 3D Data (True Labels)")
p_before
# --- Run EM Algorithm ---
init_3d <- initialize_values(data_3d, nrow(data_3d), d, K)
res_3d <- EM_algorithm(data_3d, init_3d$Pis, init_3d$Mus, init_3d$Sigmas, tol = 1e-4, verbose = TRUE)
## Initial log-likelihood:  -2573.486 
## Iteration:  1  | Current log-likelihood:  -2237.384 
## Iteration:  2  | Current log-likelihood:  -2196.989 
## Iteration:  3  | Current log-likelihood:  -2173.488 
## Iteration:  4  | Current log-likelihood:  -2146.89 
## Iteration:  5  | Current log-likelihood:  -2096.955 
## Iteration:  6  | Current log-likelihood:  -2020.971 
## Iteration:  7  | Current log-likelihood:  -1974.299 
## Iteration:  8  | Current log-likelihood:  -1968.148 
## Iteration:  9  | Current log-likelihood:  -1967.215 
## Iteration:  10  | Current log-likelihood:  -1967.092 
## Iteration:  11  | Current log-likelihood:  -1967.105 
## Iteration:  12  | Current log-likelihood:  -1967.126 
## Iteration:  13  | Current log-likelihood:  -1967.14 
## Iteration:  14  | Current log-likelihood:  -1967.147 
## Iteration:  15  | Current log-likelihood:  -1967.15 
## Iteration:  16  | Current log-likelihood:  -1967.152 
## Iteration:  17  | Current log-likelihood:  -1967.152 
## Iteration:  18  | Current log-likelihood:  -1967.153 
## Iteration:  19  | Current log-likelihood:  -1967.153 
## Iteration:  20  | Current log-likelihood:  -1967.153
# --- Final Clustering: After EM ---
# Assign each point to its most probable component.
# NOTE(review): unlike the 1D/2D demos, no label-flip step is applied here,
# so the assigned colors may be swapped relative to the true-label plot.
gamma_3d <- res_3d$gamma
assigned_3d <- apply(gamma_3d, 1, which.max)
df_3d$assigned <- factor(assigned_3d)
p_after <- plot_ly(df_3d, x = ~x, y = ~y, z = ~z, color = ~assigned,
                   colors = c("red", "blue"), type = "scatter3d", mode = "markers") %>% 
  layout(title = "Final Clustering after EM (3D)")
p_after
# --- 3a) Gamma Evolution Visualization (During EM) ---
# One 3D scatter per stored iteration, colored by the responsibility for cluster 1
gamma_plots_3d <- list()
for (i in 1:length(res_3d$gamma_history)) {
  gamma_iter <- res_3d$gamma_history[[i]]
  df_gamma <- data.frame(x = data_3d[,1], y = data_3d[,2], z = data_3d[,3],
                         prob_cluster1 = gamma_iter[,1])
  p_gamma <- plot_ly(df_gamma, x = ~x, y = ~y, z = ~z, 
                     color = ~prob_cluster1, colors = "RdBu", type = "scatter3d", mode = "markers") %>%
    layout(title = paste("Gamma Evolution - Iteration", i))
  gamma_plots_3d[[i]] <- p_gamma
}
# Display the gamma evolution plots separately (for example, display the first one)
# NOTE(review): indexing [[2]] and [[3]] assumes at least 3 iterations were stored
gamma_plots_3d[[1]]
gamma_plots_3d[[2]]
gamma_plots_3d[[3]]
# For generating ellipsoidal points

# --- Helper Function: Generate Ellipsoid Points in 3D ---
ellipsoid_points <- function(mu, sigma, level = 0.95, n = 20) {
  # Generate points on the surface of the `level`-probability ellipsoid
  # of a 3D Gaussian N(mu, sigma).
  #
  # Args:
  #   mu:    length-3 mean vector (ellipsoid center).
  #   sigma: 3 x 3 symmetric covariance matrix (assumed PSD).
  #   level: coverage probability defining the ellipsoid radius.
  #   n:     grid resolution per spherical coordinate; n^2 points returned.
  #
  # Returns: data.frame(x, y, z) of n^2 surface points.

  # Squared radius from the chi-square quantile with 3 degrees of freedom
  c_val <- qchisq(level, df = 3)
  # Parameterize the unit sphere (u = azimuth, v = polar angle)
  u <- seq(0, 2*pi, length.out = n)
  v <- seq(0, pi, length.out = n)
  grid <- expand.grid(u = u, v = v)
  x <- cos(grid$u) * sin(grid$v)
  y <- sin(grid$u) * sin(grid$v)
  z <- cos(grid$v)
  pts <- cbind(x, y, z) * sqrt(c_val)
  # Map the sphere onto the ellipsoid via the eigen-decomposition of sigma
  eig <- eigen(sigma, symmetric = TRUE)
  # ROBUSTNESS: clamp tiny negative eigenvalues (numerical noise on
  # near-singular covariances) so sqrt() cannot produce NaN
  axis_scales <- sqrt(pmax(eig$values, 0))
  ellip <- t(eig$vectors %*% diag(axis_scales) %*% t(pts))
  # Shift the surface to be centered at mu
  ellip <- sweep(ellip, 2, mu, "+")
  data.frame(x = ellip[,1], y = ellip[,2], z = ellip[,3])
}


# --- 3b) Parameter Evolution Visualization (During EM) ---
# One 3D plot per stored iteration: the data in gray, plus each cluster's
# mean and a point cloud sampling its 95% covariance ellipsoid
# (generated by ellipsoid_points() above).
param_plots_3d <- list()
for (i in 1:length(res_3d$mu_history)) {
  current_mu <- res_3d$mu_history[[i]]
  current_sigma <- res_3d$sigma_history[[i]]
  
  # Base plot: data in grey
  p_param <- plot_ly(x = data_3d[,1], y = data_3d[,2], z = data_3d[,3],
                     type = "scatter3d", mode = "markers",
                     marker = list(color = "gray", size = 2)) %>%
    layout(title = paste("Parameter Evolution - Iteration", i))
  
  # Add each cluster's mean and ellipsoid (red = cluster 1, blue = cluster 2)
  for (k in 1:K) {
    # Add mean as a marker
    p_param <- p_param %>% add_markers(x = current_mu[k, 1], y = current_mu[k, 2], z = current_mu[k, 3],
                                       marker = list(color = ifelse(k == 1, "red", "blue"), size = 6),
                                       name = paste("Mean", k))
    # Compute ellipsoid points and add as semi-transparent markers
    ellip_df <- ellipsoid_points(current_mu[k, ], current_sigma[[k]], level = 0.95, n = 20)
    p_param <- p_param %>% add_markers(data = ellip_df, x = ~x, y = ~y, z = ~z,
                                       marker = list(color = ifelse(k == 1, "red", "blue"),
                                                     size = 2, opacity = 0.3),
                                       showlegend = FALSE)
  }
  param_plots_3d[[i]] <- p_param
}
# Display the parameter evolution plot for the first iteration
param_plots_3d[[1]]
# NOTE(review): indexing [[5]] assumes at least 5 iterations were stored;
# this errors if EM converged in fewer than 5 iterations.
param_plots_3d[[5]]